########################################
########################################
## 02.1-Initial_GPFI.qmd ##
## Jacob Johnson ##
## ---------------------------------- ##
## Quarto document to summarize the ##
## feature importance results from ##
## the models with the best RMSE from ##
## the tuning done in 01 ##
## - Loads feature importance results ##
## - Plots feature importance ##
########################################
########################################

Initial Grouped Feature Importance

Grouped Feature Importance from Initial Tuned Models
Set Up
## Load packages (dplyr first so later packages mask it, not vice versa)
library(dplyr)
library(ggplot2)
library(purrr)
library(stringr)
library(gridExtra)
library(tidyr)
library(wesanderson)

## Specify file path:
## Path for working locally
local_fp <- "//cee/projects/subseasonal_extreme/"
## Path for working on the CEE Compute Server
cee_fp <- "/projects/subseasonal_extreme/"
## Select which path to use (switch to cee_fp when running on the server)
fp <- local_fp

## Data Steps
Load climate inputs:
## Variables identified by Maike as redundant
## Input columns flagged as redundant by Maike; these are dropped from the
## climate inputs immediately after loading.
redundant_cols <- c(
  "slp_weekave_atlantic_ocean_mean",
  "h_850_weekave_conus_mean",
  "h_850_weekave_atlantic_ocean_pc4",
  "h_850_weekave_mexico_gulf_mean",
  "slp_weekave_mexico_gulf_mean",
  "slp_weekave_conus_pc1",
  "ts_weekave_conus_pc1",
  "slp_weekave_atlantic_trop_mean",
  "ts_weekave_southern_canada_pc1",
  "t_850_weekave_southern_canada_mean",
  "h_850_weekave_atlantic_ocean_pc5",
  "slp_weekave_atlantic_ocean_pc6",
  "h_500_weekave_pacific_trop_mean",
  "h_850_weekave_atlantic_trop_mean",
  "t_500_weekave_pacific_trop_mean",
  "t_500_weekave_atlantic_trop_pc1",
  "h_850_weekave_pacific_ocean_pc1",
  "t_850_weekave_conus_pc1",
  "slp_weekave_atlantic_ocean_pc1",
  "slp_weekave_pacific_trop_mean",
  "t_850_weekave_southern_canada_pc1",
  "h_850_weekave_arctic_pc1",
  "h_500_weekave_atlantic_trop_mean",
  "h_850_weekave_atlantic_ocean_pc2",
  "slp_weekave_atlantic_ocean_pc3",
  "h_850_weekave_atlantic_ocean_pc7",
  "t_500_weekave_mexico_gulf_mean",
  "t_500_weekave_southern_canada_pc1",
  "slp_weekave_pacific_trop_pc2",
  "h_850_weekave_arctic_mean",
  "ts_weekave_atlantic_trop_mean",
  "h_850_weekave_pacific_trop_mean"
)
## Load climate inputs, split the date column (assumed yyyyww format --
## the first 4 characters are taken as the year) into numeric year/week,
## and drop the redundant columns.
climate_inputs <-
  readr::read_csv(
    paste0(fp, "analysis-data/weekly_aves_regional_inputs_1980_2020.csv"),
    show_col_types = FALSE
  ) |>
  mutate(date = as.character(date)) |>
  separate(date, into = c("year", "week"), sep = 4, remove = FALSE) |>
  mutate(
    year = as.numeric(year),
    week = as.numeric(week)
  ) |>
  select(-all_of(redundant_cols))

## Prepare variable names to attach to feature importance results
## (warning is okay and is due to separation of variables with different
## lengths):
## Build a lookup table that decomposes each input variable name into its
## components: climate_var (e.g. "h500", "slp"), input_region, and stat
## ("mean" or "pc*"). separate() intentionally over-splits into 5 pieces;
## the pieces are reassembled below. The "Expected 5 pieces" warning it
## emits is expected and harmless.
input_vars <-
  data.frame(var = c(names(climate_inputs)[-c(1:3)], "temp")) |>
  mutate(var = str_remove(var, "weekave_")) |>
  separate(
    var,
    into = c("var1", "var2", "var3", "var4", "var5"),
    remove = FALSE
  ) |>
  ## var1 is the climate variable; fold a pressure level (500/850) in var2
  ## into it (e.g. "h" + "500" -> "h500")
  rename(climate_var = var1) |>
  mutate(
    climate_var = ifelse(
      var2 %in% c("500", "850"),
      paste0(climate_var, var2),
      climate_var
    )) |>
  mutate(
    var2 = ifelse(
      var2 %in% c("500", "850"),
      NA,
      var2
    )) |>
  rename(input_region = var2) |>
  ## var6 holds a stat token ("pc*"/"mean") if var3 is one; otherwise NA
  mutate(
    var6 = ifelse(
      str_detect(var3, "pc"),
      var3,
      ifelse(
        str_detect(var3, "mean"),
        var3,
        NA
      )
    )
  ) |>
  ## ...and var3 keeps only non-stat tokens (region name fragments)
  mutate(
    var3 = ifelse(
      str_detect(var3, "pc"),
      NA,
      ifelse(
        str_detect(var3, "mean"),
        NA,
        var3
      )
    )
  ) |>
  ## Append any remaining var3 fragment to the region name
  mutate(
    input_region = ifelse(
      is.na(var3),
      input_region,
      ifelse(
        is.na(input_region),
        var3,
        paste0(input_region, "_", var3)
      )
    )) |>
  select(-var3) |>
  ## Same treatment for var4: var7 gets the stat token, var4 keeps region
  ## fragments which are appended to input_region
  mutate(
    var7 = ifelse(
      str_detect(var4, "pc"),
      var4,
      ifelse(
        str_detect(var4, "mean"),
        var4,
        NA
      )
    )
  ) |>
  mutate(
    var4 = ifelse(
      str_detect(var4, "pc"),
      NA,
      ifelse(
        str_detect(var4, "mean"),
        NA,
        var4
      )
    )
  ) |>
  mutate(
    input_region = ifelse(
      is.na(var4),
      input_region,
      ifelse(
        is.na(input_region),
        var4,
        paste0(input_region, "_", var4)
      )
    )) |>
  select(-var4) |>
  ## The stat is whichever of var5/var6/var7 was populated
  rename(stat = var5) |>
  mutate(
    stat = ifelse(
      is.na(stat) & is.na(var6),
      var7,
      ifelse(
        is.na(stat) & is.na(var7),
        var6,
        stat
      )
    )) |>
  select(-var6, -var7) |>
  ## The lone "temp" response variable has no region/stat; label it "temp"
  mutate(
    input_region = ifelse(is.na(input_region), "temp", input_region),
    stat = ifelse(is.na(stat), "temp", stat)
  ) |>
  ## row_number() (not 1:n()) is safe even for a zero-row frame
  mutate(var_id = row_number()) |>
  select(var_id, everything())

## Console output (expected):
## Warning: Expected 5 pieces. Missing pieces filled with `NA` in 531 rows
## [1, 3, 12, 13, 14, 15, 16, 17, 20, 24, 27, 32, 33, 34, 35, 36, 37, 38,
## 39, 40, ...].
FI Results
Determine files containing ESN feature importance results:
## Directory containing the ESN grouped feature importance results
fi_fp <- paste0(fp, "agu-manuscript-code/EESNs/02-grouped-feature-importance/results/GPFI-nreps01/")
## Keep only the per-target-region results files
fi_files <- list.files(fi_fp)
fi_files <- fi_files[str_detect(fi_files, "targetregion")]

## Load in groups from Maike:
`%ni%` <- Negate(`%in%`)
## Load Maike's variable groupings, dropping lagged features (identified
## by a "lag" token in characters 3-6 of the feature name).
## Source file All_GPFI_Merged.csv: 22,160 rows x 7 columns
## (Group, Feature, Region, Horizon, Mean_Importance, Std_Importance,
## Group_Size); only the first four columns are kept.
clusters <- readr::read_csv(
  paste0(
    fp, "agu-manuscript-code/EESNs/02-grouped-feature-importance/02.1-InitialFeatureImportance/All_GPFI_Merged.csv"),
  show_col_types = FALSE  # silence the column-spec message, as elsewhere
) |>
  select(Group, Feature, Region, Horizon) |>
  mutate(middle = substr(Feature, 3, 6)) |>
  filter(middle %ni% c("_lag", "lag_")) |>
  select(-middle)
Join Feature Importance with Groups
## Attach variable metadata (climate_var, input_region, stat) to each
## grouped feature, matching on the cleaned variable name.
input_vars_with_groups <-
  clusters |>
  mutate(var = str_remove_all(Feature, "_weekave")) |>
  rename(tau = Horizon,
         region = Region) |>
  left_join(input_vars, by = "var") |>
  select(-Feature, -var_id)

## Load ESN feature importance results:
## Load every target-region results file, tag each row with its Group
## index, and parse the model settings (region, tau, m, nh, nu) out of
## the file name.
fi_df <-
  set_names(fi_files) |>
  map(
    .f = function(x)
      ## row_number() (not 1:n()) is safe even if a file has zero rows
      read.csv(paste0(fi_fp, x)) |> mutate(Group = row_number())
  ) |>
  list_rbind(names_to = "file") |>
  ## Strip the extension; escape the dot and anchor at the end so the
  ## pattern cannot match inside the file name (".csv" as a regex would
  ## match any character followed by "csv")
  mutate(file = str_remove(file, "\\.csv$")) |>
  separate(file, into = c("region", "tau", "m", "nh", "nu")) |>
  mutate(
    region = str_remove(region, "targetregion"),
    tau = as.numeric(str_remove(tau, "tau")),
    m = as.numeric(str_remove(m, "m")),
    nh = as.numeric(str_remove(nh, "nh")),
    nu = as.numeric(str_remove(nu, "nu")),
    ## Sign flip -- presumably so larger fi = more important; confirm
    ## against the convention used in the 01 tuning scripts
    fi = -fi
  ) |>
  left_join(input_vars_with_groups, by = c("Group", "tau", "region")) |>
  filter(!is.na(fi)) |>
  select(region:nu, Group:stat, fi)

## Create ordered version of feature importance:
## Rank groups within each forecast region x horizon (rank 1 = largest fi;
## ties share a rank via dense_rank)
fi_ordered <-
  fi_df |>
  group_by(region, tau) |>
  arrange(region, tau, desc(fi)) |>
  mutate(rank = dense_rank(desc(fi))) |>
  ungroup()

## Save data:
write.csv(
  fi_ordered,
  "../results/gpfi-initial.csv",
  row.names = FALSE
)

## Most important Groups by Region and Horizon
## Subtitles: model settings per forecast region and horizon; mean()
## collapses what should be a constant value across rows
subtitles <- fi_df |>
  summarise(m = mean(m),
            nu = mean(nu),
            nh = mean(nh),
            .by = c(region, tau)) |>
  mutate(st = paste0("m:", m, " nu:", nu, " nh:", nh))
## Vars in Groups: newline-separated list of each group's member variables
## (.groups = "drop" makes the previously implicit regrouping explicit and
## silences the `summarise()` message)
groupvars <- fi_df |>
  group_by(region, tau, Group) |>
  summarise(groupvars = paste(var, collapse = ",\n"),
            .groups = "drop")
## Ordered df: average (and min/max) fi per group, ranked within each
## forecast region x horizon, with subtitles and member-variable lists
fi_ave_ordered <-
  fi_ordered |>
  summarise(
    fi_ave = mean(fi),
    fi_min = min(fi),
    fi_max = max(fi),
    .by = c(region, tau, Group)
  ) |>
  group_by(region, tau) |>
  arrange(tau, desc(fi_ave)) |>
  mutate(rank = dense_rank(desc(fi_ave))) |>
  ungroup() |>
  left_join(subtitles, by = c("region", "tau")) |>
  left_join(groupvars, by = c("region", "tau", "Group"))
## Max FI per region -- presumably used as a common plot axis limit.
## BUG FIX: the original used max(fi_ave, na.omit = F); "na.omit" is not
## an argument of max() (and does not partially match "na.rm"), so FALSE
## was absorbed into ... and coerced to 0, silently forcing plot_max >= 0.
fi_max <- fi_ave_ordered |>
  na.omit() |>
  group_by(region) |>
  summarise(plot_max = max(fi_ave, na.rm = TRUE))

## MW
NE
SE
SW
W
Averaged (over forecast region)
Compute average FI for each forecast horizon and input variable (averaged over forecast region):
## Compute average FI for each forecast horizon and input variable
## (averaged over forecast region), ranked within horizon (1 = highest).
## NOTE(review): this overwrites the per-region fi_ave_ordered defined
## above -- intentional reuse of the name, but fragile if chunks are
## re-run out of order.
fi_ave_ordered <-
  fi_ordered |>
  summarise(
    fi_ave = mean(fi),
    fi_min = min(fi),
    fi_max = max(fi),
    .by = c(tau, var)
  ) |>
  group_by(tau) |>
  arrange(tau, desc(fi_ave)) |>
  mutate(rank = row_number()) |>
  ungroup()

## Extract the 25 variables with the highest average feature importance
## (for each horizon):
## Top 25 variables per horizon; rank is converted to a factor with
## levels 25:1 (reversed -- presumably so rank 1 plots at the top)
fi_ave_ordered_sub1 <-
  fi_ave_ordered |>
  filter(rank <= 25, tau == 1) |>
  mutate(rank = factor(rank, levels = 25:1))
fi_ave_ordered_sub2 <-
  fi_ave_ordered |>
  filter(rank <= 25, tau == 2) |>
  mutate(rank = factor(rank, levels = 25:1))
fi_ave_ordered_sub3 <-
  fi_ave_ordered |>
  filter(rank <= 25, tau == 3) |>
  mutate(rank = factor(rank, levels = 25:1))
fi_ave_ordered_sub4 <-
  fi_ave_ordered |>
  filter(rank <= 25, tau == 4) |>
  mutate(rank = factor(rank, levels = 25:1))

## Plots of average FI plus/minus minimum and maximum FI values (averaged
## over forecast region):
Averaged (over forecast region, input region, and stat)
Compute average FI by forecast horizon and input climate variable:
## Compute average FI by forecast horizon and input climate variable
## (averaged over forecast region, input region, and stat), ranked
## within horizon (1 = highest).
fi_ordered_ave_by_tau_climvar <-
  fi_ordered |>
  summarise(
    fi_ave = mean(fi),
    fi_min = min(fi),
    fi_max = max(fi),
    .by = c(tau, climate_var)
  ) |>
  group_by(tau) |>
  arrange(tau, desc(fi_ave)) |>
  mutate(rank = row_number()) |>
  ungroup()

## Extract the 25 variables with the highest average feature importance
## (for each horizon):
## Top 25 climate variables per horizon; rank is converted to a factor
## with levels 25:1 (reversed -- presumably so rank 1 plots at the top)
fi_ordered_ave_by_tau_climvar_sub1 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(rank <= 25, tau == 1) |>
  mutate(rank = factor(rank, levels = 25:1))
fi_ordered_ave_by_tau_climvar_sub2 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(rank <= 25, tau == 2) |>
  mutate(rank = factor(rank, levels = 25:1))
fi_ordered_ave_by_tau_climvar_sub3 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(rank <= 25, tau == 3) |>
  mutate(rank = factor(rank, levels = 25:1))
fi_ordered_ave_by_tau_climvar_sub4 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(rank <= 25, tau == 4) |>
  mutate(rank = factor(rank, levels = 25:1))

## Plots of average FI plus/minus minimum and maximum FI values (averaged
## over forecast region):
FI by Variable
Input region:
Statistic:
Forecast region:
Climate variable: